# Libraries for parsing data
import os
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup
import re
import numpy as np
# Root folder of the project on Dropbox; the pickled output of this script is
# written beneath it.
path_dropbox = "D:/Dropbox/Research/China Foreign Share Discount"
# One speaker turn: a rule of 10+ dashes, "<speaker>[<index>]", another rule of
# 10+ dashes, then the spoken text up to the next dashed rule or end of input.
# Compiled once at import time because the function runs once per file.
_SPEECH_PATTERN = re.compile(
    r"\s*-{10,}\s*(?P<speaker>.+?)\[(?P<index>\d+)\]\s*-{10,}\s*(?P<content>.+?)(?=\s*-{10,}|\Z)",
    re.DOTALL,
)


def callConversation(path, filename):
    """Split one transcript XML file into its individual speaker turns.

    The file is parsed with ElementTree. The event id is read from the root
    element's ``Id`` attribute, and the text to split is taken from the second
    child of the root's first child (presumably the transcript body -- confirm
    against the XML schema).

    Args:
        path: Directory that contains the file.
        filename: Name of the XML file inside ``path``.

    Returns:
        A list with one dict per matched speaker turn, in document order, with
        the keys ``eventId``, ``speaker``, ``speechId`` and ``content`` (the
        last three stripped of surrounding whitespace). The list is empty when
        the element has no text or no turn matches.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        KeyError: If the root element has no ``Id`` attribute.
        IndexError: If the expected nested element is missing.
    """
    root = ET.parse(os.path.join(path, filename)).getroot()

    # from attributes
    eventId = root.attrib['Id']

    text = root[0][1].text
    # ElementTree reports an empty element's text as None; running the regex
    # on None would raise TypeError, so treat it as "no speeches".
    if not text:
        return []

    return [
        {
            'eventId': eventId,
            'speaker': match.group('speaker').strip(),
            'speechId': match.group('index').strip(),
            'content': match.group('content').strip(),
        }
        for match in _SPEECH_PATTERN.finditer(text)
    ]
# Walk the yearly transcript folders (2001 through 2023), parse every file in
# each one, and accumulate all speaker turns into a single flat list.
speeches = []
for year in range(2001, 2024):
    path = f"E:/Transcripts/{year}"
    for filename in os.listdir(path):
        speeches.extend(callConversation(path, filename))
    # Progress marker, printed once per completed year.
    print(f"{year} is done!!!")
# Output of the loop above: 2001 is done!!! 2002 is done!!! 2003 is done!!! 2004 is done!!! 2005 is done!!! 2006 is done!!! 2007 is done!!! 2008 is done!!! 2009 is done!!! 2010 is done!!! 2011 is done!!! 2012 is done!!! 2013 is done!!! 2014 is done!!! 2015 is done!!! 2016 is done!!! 2017 is done!!! 2018 is done!!! 2019 is done!!! 2020 is done!!! 2021 is done!!! 2022 is done!!! 2023 is done!!!
# Turn the collected speaker turns into a DataFrame and pickle it under the
# Dropbox project folder.
import pickle

dfSpeech = pd.DataFrame(speeches)
conversation_pkl = path_dropbox + '/Conference Call Transcript/conversation.pkl'
with open(conversation_pkl, 'wb') as pkl_file:
    pickle.dump(dfSpeech, pkl_file)